{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Noisy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This tutorial is available as an IPython notebook at [Malaya/example/noisy-translation](https://github.com/huseinzol05/Malaya/tree/master/example/noisy-translation).\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This module trained on both standard and local (included social media) language structures, so it is save to use for both.\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
      "CPU times: user 2.86 s, sys: 3.84 s, total: 6.7 s\n",
      "Wall time: 1.93 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import malaya\n",
    "import logging\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List available HuggingFace models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mesolitica/translation-t5-tiny-standard-bahasa-cased': {'Size (MB)': 139,\n",
       "  'Suggested length': 1536,\n",
       "  'en-ms chrF2++': 65.91,\n",
       "  'ms-en chrF2++': 61.3,\n",
       "  'ind-ms chrF2++': 58.15,\n",
       "  'jav-ms chrF2++': 49.33,\n",
       "  'pasar ms-ms chrF2++': 58.46,\n",
       "  'pasar ms-en chrF2++': 55.76,\n",
       "  'manglish-ms chrF2++': 51.04,\n",
       "  'manglish-en chrF2++': 52.2,\n",
       "  'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n",
       "  'to lang': ['en', 'ms']},\n",
       " 'mesolitica/translation-t5-small-standard-bahasa-cased': {'Size (MB)': 242,\n",
       "  'Suggested length': 1536,\n",
       "  'en-ms chrF2++': 67.37,\n",
       "  'ms-en chrF2++': 63.79,\n",
       "  'ind-ms chrF2++': 58.09,\n",
       "  'jav-ms chrF2++': 52.11,\n",
       "  'pasar ms-ms chrF2++': 62.49,\n",
       "  'pasar ms-en chrF2++': 60.77,\n",
       "  'manglish-ms chrF2++': 52.84,\n",
       "  'manglish-en chrF2++': 53.65,\n",
       "  'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n",
       "  'to lang': ['en', 'ms']},\n",
       " 'mesolitica/translation-t5-base-standard-bahasa-cased': {'Size (MB)': 892,\n",
       "  'Suggested length': 1536,\n",
       "  'en-ms chrF2++': 67.62,\n",
       "  'ms-en chrF2++': 64.41,\n",
       "  'ind-ms chrF2++': 59.25,\n",
       "  'jav-ms chrF2++': 52.86,\n",
       "  'pasar ms-ms chrF2++': 62.99,\n",
       "  'pasar ms-en chrF2++': 62.06,\n",
       "  'manglish-ms chrF2++': 54.4,\n",
       "  'manglish-en chrF2++': 54.14,\n",
       "  'from lang': ['en', 'ms', 'ind', 'jav', 'bjn', 'manglish', 'pasar ms'],\n",
       "  'to lang': ['en', 'ms']},\n",
       " 'mesolitica/translation-nanot5-tiny-malaysian-cased': {'Size (MB)': 205,\n",
       "  'Suggested length': 2048,\n",
       "  'en-ms chrF2++': 63.61,\n",
       "  'ms-en chrF2++': 59.55,\n",
       "  'ind-ms chrF2++': 56.38,\n",
       "  'jav-ms chrF2++': 47.68,\n",
       "  'mandarin-ms chrF2++': 36.61,\n",
       "  'mandarin-en chrF2++': 39.78,\n",
       "  'pasar ms-ms chrF2++': 58.74,\n",
       "  'pasar ms-en chrF2++': 54.87,\n",
       "  'manglish-ms chrF2++': 50.76,\n",
       "  'manglish-en chrF2++': 53.16,\n",
       "  'from lang': ['en',\n",
       "   'ms',\n",
       "   'ind',\n",
       "   'jav',\n",
       "   'bjn',\n",
       "   'manglish',\n",
       "   'pasar ms',\n",
       "   'mandarin',\n",
       "   'pasar mandarin'],\n",
       "  'to lang': ['en', 'ms']},\n",
       " 'mesolitica/translation-nanot5-small-malaysian-cased': {'Size (MB)': 358,\n",
       "  'Suggested length': 2048,\n",
       "  'en-ms chrF2++': 66.98,\n",
       "  'ms-en chrF2++': 63.52,\n",
       "  'ind-ms chrF2++': 58.1,\n",
       "  'jav-ms chrF2++': 51.55,\n",
       "  'mandarin-ms chrF2++': 46.09,\n",
       "  'mandarin-en chrF2++': 44.13,\n",
       "  'pasar ms-ms chrF2++': 63.2,\n",
       "  'pasar ms-en chrF2++': 59.78,\n",
       "  'manglish-ms chrF2++': 54.09,\n",
       "  'manglish-en chrF2++': 55.27,\n",
       "  'from lang': ['en',\n",
       "   'ms',\n",
       "   'ind',\n",
       "   'jav',\n",
       "   'bjn',\n",
       "   'manglish',\n",
       "   'pasar ms',\n",
       "   'mandarin',\n",
       "   'pasar mandarin'],\n",
       "  'to lang': ['en', 'ms']},\n",
       " 'mesolitica/translation-nanot5-base-malaysian-cased': {'Size (MB)': 990,\n",
       "  'Suggested length': 2048,\n",
       "  'en-ms chrF2++': 67.87,\n",
       "  'ms-en chrF2++': 64.79,\n",
       "  'ind-ms chrF2++': 56.98,\n",
       "  'jav-ms chrF2++': 51.21,\n",
       "  'mandarin-ms chrF2++': 47.39,\n",
       "  'mandarin-en chrF2++': 48.78,\n",
       "  'pasar ms-ms chrF2++': 65.06,\n",
       "  'pasar ms-en chrF2++': 64.03,\n",
       "  'manglish-ms chrF2++': 57.91,\n",
       "  'manglish-en chrF2++': 55.66,\n",
       "  'from lang': ['en',\n",
       "   'ms',\n",
       "   'ind',\n",
       "   'jav',\n",
       "   'bjn',\n",
       "   'manglish',\n",
       "   'pasar ms',\n",
       "   'mandarin',\n",
       "   'pasar mandarin'],\n",
       "  'to lang': ['en', 'ms']}}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.translation.available_huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1. tested on FLORES200 pair `dev` set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/flores200-eval\n",
      "2. tested on noisy test set, https://github.com/huseinzol05/malay-dataset/tree/master/translation/noisy-eval\n",
      "3. check out NLLB 200 metrics from `malaya.translation.nllb_metrics`.\n",
      "4. check out Google Translate metrics from `malaya.translation.google_translate_metrics`.\n"
     ]
    }
   ],
   "source": [
    "print(malaya.translation.info)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Improvements of new model\n",
    "\n",
    "1. able to translate `[en, ms, ind, jav, bjn, manglish, pasar ms, mandarin, pasar mandarin]` while old model only able to translate `[en, ms, pasar ms]`.\n",
    "2. No longer required `from_lang` part of the prefix.\n",
    "3. able to retain text structure as it is."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Transformer models\n",
    "\n",
    "```python\n",
    "def huggingface(\n",
    "    model: str = 'mesolitica/translation-t5-small-standard-bahasa-cased',\n",
    "    force_check: bool = True,\n",
    "    from_lang: List[str] = None,\n",
    "    to_lang: List[str] = None,\n",
    "    old_model: bool = False,\n",
    "    **kwargs,\n",
    "):\n",
    "    \"\"\"\n",
    "    Load HuggingFace model to translate.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model: str, optional (default='mesolitica/translation-t5-small-standard-bahasa-cased')\n",
    "        Check available models at `malaya.translation.available_huggingface()`.\n",
    "    force_check: bool, optional (default=True)\n",
    "        Force check model one of malaya model.\n",
    "        Set to False if you have your own huggingface model.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: malaya.torch_model.huggingface.Translation\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "model = malaya.translation.huggingface(model = 'mesolitica/translation-nanot5-small-malaysian-cased')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Translate\n",
    "\n",
    "```python\n",
    "def generate(self, strings: List[str], to_lang: str = 'ms', **kwargs):\n",
    "    \"\"\"\n",
    "    Generate texts from the input.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    strings : List[str]\n",
    "    to_lang: str, optional (default='ms')\n",
    "        target language to translate.\n",
    "    **kwargs: vector arguments pass to huggingface `generate` method.\n",
    "        Read more at https://huggingface.co/docs/transformers/main_classes/text_generation\n",
    "\n",
    "        If you are using `use_ctranslate2`, vector arguments pass to ctranslate2 `translate_batch` method.\n",
    "        Read more at https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?highlight=translate_batch#ctranslate2.Translator.translate_batch\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[str]\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Noisy malay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings = [\n",
    "    'ak tak paham la',\n",
    "    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',\n",
    "    \"Memanglah. Ini tak payah expert, aku pun tau. It's a gesture, bodoh.\",\n",
    "    'jam 8 di pasar KK memang org ramai 😂, pandai dia pilih tmpt.',\n",
    "    'Jadi haram jadah😀😃🤭',\n",
    "    'nak gi mana tuu',\n",
    "    'Macam nak ambil half day',\n",
    "    \"Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.\",\n",
    "    'mesolitica boleh buat asr tak',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Saya tidak faham',\n",
      " 'Hi guys! Saya perasan semalam dan hari ini ramai yang dapat cookies ni kan. '\n",
      " 'Jadi hari ini saya ingin berkongsi beberapa post mortem dari batch pertama '\n",
      " 'kami:',\n",
      " 'Memanglah. Ini tidak perlu pakar, saya juga tahu. Ini adalah isyarat, bodoh.',\n",
      " 'Jam 8 di pasar KK memang ramai orang 😂, pandai dia pilih tempat.',\n",
      " 'Jadi haram jadah 😀😃🤭',\n",
      " 'Ke mana kamu pergi?',\n",
      " 'Saya ingin mengambil separuh hari',\n",
      " 'Bayangkan PH dan menang dalam PRU-14. Kemudian terdapat pelbagai pintu '\n",
      " 'belakang. Akhirnya, Ismail Sabri naik. Itulah sebabnya saya tidak lagi '\n",
      " 'peduli tentang politik. Saya bersumpah saya sudah pergi.',\n",
      " 'Bolehkah mesolitica digunakan untuk membuat asr?']\n",
      "CPU times: user 34.7 s, sys: 47 ms, total: 34.8 s\n",
      "Wall time: 2.94 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"I don't understand\",\n",
      " 'Hi guys! I noticed that many people have received cookies yesterday and '\n",
      " 'today. So today I want to share some post mortem of our first batch:',\n",
      " \"Indeed. No need for an expert, I know. It's a gesture, stupid.\",\n",
      " \"At 8 o'clock in the KK market, it's crowded 😂, he's clever in choosing a \"\n",
      " 'place.',\n",
      " \"So it's illegal😀😃🤭\",\n",
      " 'Where are you going?',\n",
      " 'How to take half a day',\n",
      " 'Imagine PH and winning the 14th general election. Then there are all sorts '\n",
      " \"of backgazes. In the end, Ismail Sabri got in. That's why I don't care about \"\n",
      " \"politics anymore. I swear I'm already fucked up.\",\n",
      " 'Can the mesolitica make Asr?']\n",
      "CPU times: user 51.6 s, sys: 89.9 ms, total: 51.7 s\n",
      "Wall time: 4.41 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Manglish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings = [\n",
    "    'i know plenty of people who snack on sambal ikan bilis.',\n",
    "    'I often visualize my own programs algorithm before implemment it.',\n",
    "    'Am I the only one who used their given name ever since I was a kid?',\n",
    "    'Gotta be wary of pimples. Oh they bleed bad when cut',\n",
    "    'Smh the dude literally has a rubbish bin infront of his house',\n",
    "    \"I think I won't be able to catch it within 1 min lol\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Saya kenal ramai orang yang makan sambal ikan bilis.',\n",
      " 'Saya sering memvisualisasikan algoritma program saya sendiri sebelum '\n",
      " 'mengimplemmennya.',\n",
      " 'Adakah saya seorang sahaja yang menggunakan nama mereka sejak saya masih '\n",
      " 'kecil?',\n",
      " 'Kena berhati-hati dengan jerawat. Oh, mereka berdarah teruk apabila dipotong',\n",
      " 'Sial, lelaki itu benar-benar mempunyai tong sampah di depan rumahnya.',\n",
      " 'Saya rasa saya tidak akan dapat menangkapnya dalam masa 1 minit lol']\n",
      "CPU times: user 6.07 s, sys: 9.2 ms, total: 6.08 s\n",
      "Wall time: 519 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['I know a lot of people who take snacks on sambal ikan bilis.',\n",
      " 'I often visualize my own program algorithm before impersonating it.',\n",
      " 'Am I the only one who has used their given name ever since I was a child?',\n",
      " 'You need to be cautious of pimples. Oh, they bleed badly when cut.',\n",
      " 'Oh my, the man is literally in a rubbish bin in front of his house.',\n",
      " \"I don't think I can catch it within 1 minute, haha\"]\n",
      "CPU times: user 8.09 s, sys: 4.09 ms, total: 8.1 s\n",
      "Wall time: 685 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Local Mandarin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "strings = [\n",
    "    '某个角度漂亮，但我觉得不是很耐看。',\n",
    "    '就是暂时好看的意思咯？',\n",
    "    'i think, 有狐狸般的妖媚，确实是第一人选。'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Sudut yang cantik, tetapi saya rasa tidak begitu menarik.',\n",
      " 'Adakah ini bermaksud untuk sementara kelihatan cantik?',\n",
      " 'Saya rasa, mempunyai gadis-gadis yang sangat cantik dan cantik memang '\n",
      " 'menjadi pilihan pertama.']\n",
      "CPU times: user 4.25 s, sys: 2.94 ms, total: 4.26 s\n",
      "Wall time: 364 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'ms', max_length = 1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"A certain angle is beautiful, but I don't think it's very durable.\",\n",
      " 'Is it just for now good-looking?',\n",
      " 'I believe that having a fox-like and cute demeanor is indeed the first '\n",
      " 'choice.']\n",
      "CPU times: user 4.5 s, sys: 0 ns, total: 4.5 s\n",
      "Wall time: 378 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "pprint(model.generate(strings, to_lang = 'en', max_length = 1000))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
